stopifnot(packageVersion("quanteda") >= "1.5")
#stopifnot(packageVersion("LSS") >= "0.7.5")
require(quanteda)
require(stringi)
require(LSX)
require(maps)
require(ggplot2)
require(countrycode)
require(quanteda.textmodels)

# Let quanteda use up to 60 threads for its parallelised operations.
quanteda_options(threads = 60)

# Arabic month names, collected by formatting every day of 2019 in the "ar"
# locale: first the abbreviated form ("LLL"), then the full form ("LLLL").
date <- seq(as.Date("2019-01-01"), as.Date("2019-12-31"), by = "1 day")
month_ar <- c(unique(stri_datetime_format(date, "LLL", locale = "ar")),
              unique(stri_datetime_format(date, "LLLL", locale = "ar")))

# Dictionaries loaded from YAML files in the working directory.
dict <- dictionary(file = "dictionary_ar.yml")
dict2 <- dictionary(file = "dictionary2.yml")
dict3 <- dictionary(file = "dictionary3.yml")


# Tokenize a corpus and keep only "word-like" tokens.
#
# corp: object accepted by quanteda::tokens().
#
# URLs are removed during tokenization. Afterwards only tokens made up
# entirely of digits, letters, emoji, hyphens and apostrophes are kept; the
# match is case-sensitive and padding = TRUE is passed so that dropped tokens
# are padded rather than deleted.
tokenize_arabic <- function(corp) {
    word_like <- "^[\\p{Nd}\\p{L}\\p{Emoji_Presentation}\\-']+$"
    all_tokens <- tokens(corp, remove_url = TRUE)
    kept <- tokens_select(
        all_tokens, word_like,
        valuetype = "regex",
        case_insensitive = FALSE,
        padding = TRUE
    )
    kept
}

# Plot a smoothed LSS polarity time series with base graphics.
#
# x:        data.frame handed to LSX::smooth_lss(); must have a `publication`
#           column when `select` is given.
# lss_var:  name of the column holding the polarity scores.
# select:   optional publication name(s); only matching rows are smoothed.
# span:     smoothing span forwarded to LSX::smooth_lss().
# col:      line colour; the confidence band uses it at 20% opacity.
# add:      if TRUE, draw onto the current plot instead of opening a new one.
# se:       if TRUE, also draw fit +/- 1.96 * se.fit.
# from, to: date range used for smoothing and for the x-axis.
# month:    if TRUE, label every month and put the years on a second line;
#           otherwise only label the years.
# ylim:     y-axis limits.
# ...:      forwarded to lines().
plot_lss <- function(x, lss_var, select = NULL, span = 0.1, col = "black", add = FALSE, se = TRUE,
                     from = as.Date("2014-01-01"), to = as.Date("2023-12-31"), month = TRUE,
                     ylim = c(-1.8, 1.8), ...) {
    # The margins are deliberately not restored on exit: later add = TRUE
    # calls and annotations must draw into the same plot region.
    par(mar = c(if (month) 3.1 else 2.1, 4.1, 1.1, 1.1))

    if (!is.null(select)) {
        # Explicit indexing instead of subset(): non-standard evaluation would
        # pick up a column of `x` called `select` instead of the argument.
        publication <- x[["publication"]]
        if (is.null(publication))
            stop("`x` must have a `publication` column when `select` is given",
                 call. = FALSE)
        x <- x[which(publication %in% select), , drop = FALSE]
    }

    pred <- LSX::smooth_lss(
        x, lss_var = lss_var, span = span,
        from = from, to = to, engine = "locfit"
    )

    if (!add) {
        # Empty canvas; the curves are added by lines() below.
        plot(pred$date, pred$fit, ylim = ylim, xlab = "", ylab = "Framing",
             type = "n", xaxt = "n", xlim = c(from, to))
        grid()
        abline(h = 0)

        ticks <- seq.Date(from, to, by = "1 month")
        is_jan <- format(ticks, "%m") == "01"
        if (month) {
            axis(1, ticks, format(ticks, "%m"))
            axis(1, ticks[is_jan], format(ticks[is_jan], "%Y"), line = 1, tick = FALSE)
        } else {
            axis(1, ticks[is_jan], format(ticks[is_jan], "%Y"), tick = TRUE)
        }
    }

    lines(pred$date, pred$fit, type = "l", col = col, ...)
    if (se) {
        band_col <- adjustcolor(col, 0.2)
        margin <- pred$se.fit * 1.96
        lines(pred$date, pred$fit + margin, type = "l", col = band_col, ...)
        lines(pred$date, pred$fit - margin, type = "l", col = band_col, ...)
    }
}


# Variant of plot_lss() with a shorter default period (2014-01-01 to
# 2021-04-01) and a narrower default y-range (-1 to 1).
#
# The drawing logic is identical to plot_lss(), so this simply forwards every
# argument to it instead of keeping a second copy of the body. See plot_lss()
# for the meaning of the arguments; `...` is forwarded to lines().
plot_lss2 <- function(x, lss_var, select = NULL, span = 0.1, col = "black", add = FALSE, se = TRUE,
                     from = as.Date("2014-01-01"), to = as.Date("2021-04-01"), month = TRUE,
                     ylim = c(-1.0, 1.0), ...) {
  plot_lss(x, lss_var = lss_var, select = select, span = span, col = col,
           add = add, se = se, from = from, to = to, month = month,
           ylim = ylim, ...)
}

# Variant of plot_lss() with the default period 2016-10-18 to 2020-12-30 and
# a narrower default y-range (-1 to 1).
#
# The drawing logic is identical to plot_lss(), so this simply forwards every
# argument to it instead of keeping a third copy of the body. See plot_lss()
# for the meaning of the arguments; `...` is forwarded to lines().
plot_lss3 <- function(x, lss_var, select = NULL, span = 0.1, col = "black", add = FALSE, se = TRUE,
                     from = as.Date("2016-10-18"), to = as.Date("2020-12-30"), month = TRUE,
                     ylim = c(-1.0, 1.0), ...) {
  plot_lss(x, lss_var = lss_var, select = select, span = span, col = col,
           add = add, se = se, from = from, to = to, month = month,
           ylim = ylim, ...)
}


# Plot the smoothed daily number of documents.
#
# x:        data.frame with a `date` column (and a `publication` column when
#           `select` is given).
# select:   optional publication name(s); only matching rows are counted.
# m:        kernel dimensions passed to kernel("daniell", m).
# add:      if TRUE, draw onto the current plot instead of opening a new one.
# ...:      forwarded to both plot() and lines().
# from, to: first and last day of the counting window. They come after `...`
#           and therefore have to be given by name; the defaults reproduce
#           the previously hard-coded 2014-01-01 to 2020-12-31 window.
#
# Days without documents count as zero and dates outside the window are
# ignored. kernapply() stops with an error if the window is too short for
# the chosen kernel.
plot_volume <- function(x, select = NULL, m = c(10, 10), add = FALSE, ...,
                        from = as.Date("2014-01-01"),
                        to = as.Date("2020-12-31")) {
    date <- seq(as.Date(from), as.Date(to), by = "day")
    par(mar = c(2.1, 4.1, 1.1, 1.1))

    if (!is.null(select)) {
        # Explicit indexing instead of subset(): non-standard evaluation would
        # pick up a column of `x` called `select` instead of the argument.
        publication <- x[["publication"]]
        if (is.null(publication))
            stop("`x` must have a `publication` column when `select` is given",
                 call. = FALSE)
        x <- x[which(publication %in% select), , drop = FALSE]
    }

    # One row per day of the window, row names are the dates.
    freq <- as.matrix(table(factor(as.character(x$date),
                                   levels = as.character(date))))
    smooth <- kernapply(freq, kernel("daniell", m))
    day <- as.Date(rownames(smooth))

    if (!add) {
        plot(day, smooth, lty = 1, type = "n",
             ylab = "Volume", ...)
        grid()
    }
    lines(day, smooth, lty = 1, type = "l", ...)
}

# Mark events on the current plot with dotted vertical lines and labels.
#
# x:      named list (or vector) of dates; it is flattened with unlist() and
#         converted with as.Date(). The names are used as labels.
# bottom: if TRUE the labels are placed 2% of the plot height above the lower
#         edge, otherwise 2% below the upper edge.
add_events <- function(x, bottom = TRUE) {
    events <- unlist(x)
    when <- as.Date(events)
    abline(v = when, lty = 3)

    # par("usr") is c(x1, x2, y1, y2); the label offset is 2% of the y-range.
    usr <- par("usr")
    offset <- (usr[4] - usr[3]) * 0.02
    if (bottom) {
        text(when, usr[3] + offset,
             names(events), srt = 90, adj = 0, pos = 4)
    } else {
        text(when, usr[4] - offset,
             names(events), srt = 90, adj = 0, pos = 2)
    }
}

# Variance of the per-period mean score.
#
# x:        data.frame (or list) holding the scores and the time variable.
# lss_var:  name of the score column.
# time_var: name of the column that defines the periods.
#
# Scores are averaged within each value of `time_var` (ignoring NA) and the
# variance of those averages is returned.
var_time <- function(x, lss_var = "lss", time_var = "date") {
    # Read from the argument `x`; the previous version read a global `dat`.
    by_period <- split(x[[lss_var]], x[[time_var]])
    period_mean <- vapply(by_period, mean, numeric(1), na.rm = TRUE)
    var(period_mean, na.rm = TRUE)
}

# Add a `crisis` factor that classifies each row by its `date`:
#   "pre"  - up to and including 2014-06-09
#   "peri" - 2014-06-10 to 2017-12-09
#   "post" - 2017-12-10 onwards
# Rows that fall into none of the periods (e.g. missing dates) stay NA.
# Returns `x` with the new column.
add_crisis <- function(x) {
    pre_end <- as.Date("2014-06-09")
    peri_start <- as.Date("2014-06-10")
    peri_end <- as.Date("2017-12-09")
    post_start <- as.Date("2017-12-10")

    is_pre <- x$date <= pre_end
    is_peri <- x$date >= peri_start & x$date <= peri_end
    is_post <- x$date >= post_start

    # The three periods do not overlap, so the assignment order is irrelevant.
    x$crisis <- NA
    x$crisis[is_post] <- "post"
    x$crisis[is_peri] <- "peri"
    x$crisis[is_pre] <- "pre"
    x$crisis <- factor(x$crisis, levels = c("pre", "peri", "post"))
    x
}

# Render a random sample of documents through rmarkdown for reading.
#
# x:    object with docvars() and texts(); the docvars `head`, `date`,
#       `docid`, `lss` and `confidence.fit` are used in the output.
# file: path of the intermediate markdown file; it is rendered with
#       rmarkdown::render() and then deleted.
# ...:  passed to subset() to filter the documents.
# n:    maximum number of documents to sample.
#
# The sampled documents are written in date order. Returns the result of
# file.remove().
export_texts <- function(x, file, ..., n = 50) {
    x <- cbind(docvars(x), text = texts(x))
    x <- subset(x, ...)

    picked <- sample(seq_len(nrow(x)), min(n, nrow(x)))
    sampled <- x[picked, ]
    sampled <- sampled[order(sampled$date), ]

    # One heading block per document, followed by its text.
    heading <- sprintf(
        "<h1>%s</h1>\n\n<h2><span>%s</span> (%s, %0.2f, %0.2f)</h2>\n\n",
        sampled$head, sampled$date, sampled$docid, sampled$lss,
        sampled$confidence.fit
    )
    body <- paste0(heading, sampled$text, collapse = "\n\n")

    cat("---\n",
        "css: ../style.css\n",
        "---\n\n",
        body,
        file = file, sep = "")
    rmarkdown::render(file)
    file.remove(file)
}

# Draw a map of mean scores by country for Europe and its neighbourhood.
#
# x:       data.frame with a `class` column whose last two characters are
#          used as the country code, and the score column.
# lss_var: name of the score column.
# n:       only the `n` most frequent classes are kept.
#
# Returns the ggplot object.
plot_eurpe <- function(x, lss_var, n = 20) {

    # Restrict to the most frequent classes.
    class_counts <- table(x$class)
    x <- subset(x, class %in% names(tail(sort(class_counts), n)))

    # World polygons without Antarctica, keyed by ISO code.
    countries <- map_data(map = "world")
    countries <- subset(countries, region != "Antarctica")
    countries$region <- iso.alpha(countries$region, 2) # convert country name to ISO code

    # Upper-cased two-letter suffix of the class; the factor levels are all
    # map regions so that countries without data are kept by drop = FALSE.
    x$class <- stri_trans_toupper(stri_sub(x$class, -2, -1))
    x$id <- factor(x$class, levels = unique(countries$region))
    country_mean <- aggregate(list(score = x[[lss_var]]), by = list(id = x$id),
                              mean, na.rm = TRUE, drop = FALSE)

    map_layer <- geom_map(aes(fill = score, map_id = id), map = countries)
    fill_scale <- scale_fill_continuous(name = "Intensity", na.value = "lightgray",
                                        limits = c(-2, 2))
    viewport <- coord_fixed(xlim = c(-9, 70),
                            ylim = c(20, 70),
                            ratio = 1)

    ggplot(country_mean) +
        map_layer +
        expand_limits(x = countries$long, y = countries$lat) +
        fill_scale +
        theme_void() +
        viewport
}

# Print a random sample of documents as markdown to the console.
#
# x:       corpus-like object supporting corpus_sample(), docvars(),
#          docnames() and texts().
# field:   docvars shown, formatted and space-separated, in a code span.
# heading: docvar used as the "###" heading of each document.
# sort_by: docvar to order the sample by; NULL keeps the sampled order.
# n:       maximum number of documents to sample.
# digits:  passed to format() for the `field` values.
#
# NOTE(review): with a single `field`, docvars() may return a vector instead
# of a data.frame, in which case the values of all documents would be pasted
# into one string - confirm against the quanteda version in use.
as.markdown <- function(x, field = c("class", "confidence.fit", "lss_security"), 
                        heading = "head", 
                        sort_by = "date", n = 20, digits = 3) {
    x <- corpus_sample(x, min(ndoc(x), n))
    if (!is.null(sort_by)) {
        x <- x[order(docvars(x, sort_by))]
    }

    formatted <- lapply(docvars(x, field), format, digits = digits)
    field_line <- do.call(paste, formatted)

    title_part <- paste0("### ", docvars(x, heading), "\n\n")
    meta_part <- paste0("*", docnames(x), " ", docvars(x, "date"), "*\n\n")
    field_part <- paste0("`", field_line, "`\n\n")
    text_part <- paste0(texts(x), "\n\n\n")

    cat(paste0(title_part, meta_part, field_part, text_part), sep = "")
}

# Add geographic indicator columns derived from the prefix of `x$class`.
#
#   near   - class starts with "europe", "africa.north" or "asia.west"
#   me     - class starts with "asia.west" or "africa.north"
#   europe - class starts with "europe" but not with "europe.north.gb"
#   gb     - class starts with "europe.north.gb"
#   de     - class starts with "europe.west.de"
#   fr     - class starts with "europe.west.fr"
#   region - first "<word>.<word>" found in the class, NA if there is none
#
# Returns `x` with the new columns.
add_geography <- function(x) {

    x$near <- stri_detect_regex(x$class, "^(europe|africa\\.north|asia\\.west)")
    x$me <- stri_startswith_fixed(x$class, "asia.west") |
            stri_startswith_fixed(x$class, "africa.north")
    x$europe <- stri_startswith_fixed(x$class, "europe") &
                !stri_startswith_fixed(x$class, "europe.north.gb")
    x$gb <- stri_startswith_fixed(x$class, "europe.north.gb")
    x$de <- stri_startswith_fixed(x$class, "europe.west.de")
    x$fr <- stri_startswith_fixed(x$class, "europe.west.fr")
    # stri_extract_first_regex() returns a character vector; the previously
    # used stri_match_first_regex() returns a one-column matrix, which ended
    # up stored as a matrix column.
    x$region <- stri_extract_first_regex(x$class, "\\w+\\.\\w+")
    return(x)
}

# Fit an LSS model on a subset of documents and return selected word scores.
#
# x:        dfm with docvars.
# seeds:    seed words passed to textmodel_lss().
# features: names of the features whose coefficients are returned.
# subset:   unquoted logical expression evaluated in docvars(x), falling back
#           to the caller's environment; documents where it is NA are dropped.
# ...:      currently unused.
#
# Prints divergence() of the fitted model and returns a one-row matrix with
# the coefficients of `features` followed by the 5% and 95% quantiles of all
# coefficients.
compute_similarity <- function(x, seeds, features, subset, ...) {
    condition <- substitute(subset)
    is_selected <- eval(condition, docvars(x), parent.frame())
    x <- dfm_subset(x, is_selected & !is.na(is_selected))

    fitted <- textmodel_lss(x, seeds, cache = TRUE)
    print(divergence(fitted))

    weights <- coef(fitted)
    tails <- quantile(weights, c(0.05, 0.95))
    scores <- c(weights[features], tails)
    t(as.matrix(scores))
}

